# Author: Stephen Situ
# This is an exercise for practicing the ETL (Extract-Transform-Load) process for data integration. We have
# CSV, json, and xml files in our directory. By defining a function for extracting each type, and then running a
# final extract function that loops through all the files, we can create a pandas dataframe that holds the
# combined data. Additionally, we create a timestamped logfile to track these processes.
# Libraries
import glob # For selecting files in directory
import pandas as pd # For reading csv files
import xml.etree.ElementTree as ET # For reading xml files
from datetime import datetime # For Getting date/time
# Set Paths
tmpfile = "dealership_temp.tmp" # temp file used to store all extracted data
logfile = "dealership_logfile.txt" # Log file
targetfile = "dealership_transformed_data.csv" # CSV file to store all all data
# Define function to read CSV and return the dataframe
def extract_from_csv(file_to_process):
dataframe = pd.read_csv(file_to_process)
return dataframe
# Define function to read CSV and return dataframe
def extract_from_json(file_to_process):
dataframe = pd.read_json(file_to_process,lines=True)
return dataframe
# Definte function to read xml file and append values to dataframe
def extract_from_xml(file_to_process):
dataframe = pd.DataFrame(columns=["car_model", "year_of_manufacture", "price", "fuel"])
tree = ET.parse(file_to_process)
root = tree.getroot()
for row in root:
car_model = row.find("car_model").text
year_of_manufacture = row.find("year_of_manufacture").text
price = float(row.find("price").text)
fuel = row.find("fuel").text
dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True)
return dataframe
# Define function to
def extract():
extracted_data = pd.DataFrame(columns=['car_model','year_of_manufacture','price','fuel']) # create an empty data frame to hold extracted data
# Loop through all csv files in directory, * indicates all files
for csvfile in glob.glob("*.csv"):
extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True)
# Loop through all json files in directroy, * star indicates all files
for jsonfile in glob.glob("*.json"):
extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True)
# Loop through all xml files in directory, * star indicates all files
for xmlfile in glob.glob("*.xml"):
extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
return extracted_data
# Define transformation function to round price to 2 digits
def transform(data):# Add the transform function below
data['price'] = round(data.price,2)
return data
# Define load functions to write into CSV file
def load(targetfile,data_to_load):
data_to_load.to_csv(targetfile)
# Define log function to write timesteamp in to Log file
# with open, opens logfile.txt and appends
def log(message):
timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second
now = datetime.now() # get current timestamp
timestamp = now.strftime(timestamp_format)
with open("logfile.txt","a") as f:
f.write(timestamp + ',' + message + '\n')
# Start ETL Log
log("ETL Job Started")
# Extract Phase
log("Extract phase Started")
extracted_data = extract()
log("Extract phase Ended")
extracted_data
C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:8: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:8: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:8: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_csv(csvfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:12: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:12: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:12: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_json(jsonfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\3203211828.py:11: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:16: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\3203211828.py:11: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:16: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\3203211828.py:11: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. dataframe = dataframe.append({"car_model": car_model, "year_of_manufacture":year_of_manufacture, "price": price,"fuel":fuel}, ignore_index=True) C:\Users\Steve\AppData\Local\Temp\ipykernel_1456\2660047292.py:16: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. extracted_data = extracted_data.append(extract_from_xml(xmlfile), ignore_index=True)
car_model | year_of_manufacture | price | fuel | |
---|---|---|---|---|
0 | ritz | 2014 | 5000.000000 | Petrol |
1 | sx4 | 2013 | 7089.552239 | Diesel |
2 | ciaz | 2017 | 10820.895522 | Petrol |
3 | wagon r | 2011 | 4253.731343 | Petrol |
4 | swift | 2014 | 6865.671642 | Diesel |
... | ... | ... | ... | ... |
58 | etios g | 2014 | 6119.402985 | Petrol |
59 | fortuner | 2014 | 29835.820896 | Diesel |
60 | corolla altis | 2013 | 10373.134328 | Petrol |
61 | etios liva | 2014 | 5895.522388 | Diesel |
62 | corolla altis | 2016 | 21985.074627 | Diesel |
63 rows × 4 columns
# Transform phase
log("Transform phase Started")
transformed_data = transform(extracted_data)
log("Transform phase Ended")
transformed_data
car_model | year_of_manufacture | price | fuel | |
---|---|---|---|---|
0 | ritz | 2014 | 5000.00 | Petrol |
1 | sx4 | 2013 | 7089.55 | Diesel |
2 | ciaz | 2017 | 10820.90 | Petrol |
3 | wagon r | 2011 | 4253.73 | Petrol |
4 | swift | 2014 | 6865.67 | Diesel |
... | ... | ... | ... | ... |
58 | etios g | 2014 | 6119.40 | Petrol |
59 | fortuner | 2014 | 29835.82 | Diesel |
60 | corolla altis | 2013 | 10373.13 | Petrol |
61 | etios liva | 2014 | 5895.52 | Diesel |
62 | corolla altis | 2016 | 21985.07 | Diesel |
63 rows × 4 columns
# load phase
log("Load phase Started")
load(targetfile,transformed_data)
log("Load phase Ended")
# End ETL
log("ETL Job Ended")
# Check Dataframe contents
Finished_df = pd.read_csv('dealership_transformed_data.csv')
Finished_df
Unnamed: 0 | car_model | year_of_manufacture | price | fuel | |
---|---|---|---|---|---|
0 | 0 | ritz | 2014 | 5000.00 | Petrol |
1 | 1 | sx4 | 2013 | 7089.55 | Diesel |
2 | 2 | ciaz | 2017 | 10820.90 | Petrol |
3 | 3 | wagon r | 2011 | 4253.73 | Petrol |
4 | 4 | swift | 2014 | 6865.67 | Diesel |
... | ... | ... | ... | ... | ... |
58 | 58 | etios g | 2014 | 6119.40 | Petrol |
59 | 59 | fortuner | 2014 | 29835.82 | Diesel |
60 | 60 | corolla altis | 2013 | 10373.13 | Petrol |
61 | 61 | etios liva | 2014 | 5895.52 | Diesel |
62 | 62 | corolla altis | 2016 | 21985.07 | Diesel |
63 rows × 5 columns
# Check Logfile
with open('logfile.txt') as f:
contents = f.read()
print(contents)
2022-Nov-28-17:26:59,ETL Job Started 2022-Nov-28-17:27:01,Extract phase Started 2022-Nov-28-17:27:01,Extract phase Ended 2022-Nov-28-17:27:04,Transform phase Started 2022-Nov-28-17:27:04,Transform phase Ended 2022-Nov-28-17:27:06,Load phase Started 2022-Nov-28-17:27:06,Load phase Ended 2022-Nov-28-17:27:08,ETL Job Ended